Download Packages

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(ggthemes)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(nycflights13)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(anyflights)
library(ggmap)
## ℹ Google's Terms of Service: ]8;;https://mapsplatform.google.com<https://mapsplatform.google.com>]8;;
## ℹ Please cite ggmap if you use it! Use `citation("ggmap")` for details.
## 
## Attaching package: 'ggmap'
## The following object is masked from 'package:plotly':
## 
##     wind

Set Up Data

COVID-19 ravaged New York in 2020, with the entire city and metropolitan area living under tight quarantine controls to flatten the COVID curve as mortalities rose. 19 years previous, the loss of the World Trade Center due to terrorist attacks similarly left the entirety of New York stunned and in fear. Both of these major world events caused a major airline recession, with flights being cancelled and delayed as fear of planes meant air travel was used mostly for serious reasons.

COVID 19 and 9/11 are two major events that impacted the travel industry. We would like to investigate how each event altered flight travel, and how NYC travel was able to recover after these events.

To investigate cancellations and departure delays, we decided to choose only airports in NYC for years:

aline <- get_airlines()
aport<- get_airports()

JFK2020<-get_flights(“JFK”, 2020)%>% inner_join(aline,by=“carrier”) %>% inner_join(aport,by=c(“origin”=“faa”)) %>% inner_join(aport, by=c(“dest”=“faa”), suffix=c(“_origin”,“_dest”))

JFK2013<-get_flights(“JFK”, 2013)%>% inner_join(aline,by=“carrier”) %>% inner_join(aport,by=c(“origin”=“faa”)) %>% inner_join(aport, by=c(“dest”=“faa”), suffix=c(“_origin”,“_dest”))

JFK2001<-get_flights(“JFK”, 2001)%>% inner_join(aline,by=“carrier”) %>% inner_join(aport,by=c(“origin”=“faa”)) %>% inner_join(aport, by=c(“dest”=“faa”), suffix=c(“_origin”,“_dest”))

LGA2020<-get_flights(“LGA”, 2020)%>% inner_join(aline,by=“carrier”) %>% inner_join(aport,by=c(“origin”=“faa”)) %>% inner_join(aport, by=c(“dest”=“faa”), suffix=c(“_origin”,“_dest”))

LGA2013<-get_flights(“LGA”, 2013)%>% inner_join(aline,by=“carrier”) %>% inner_join(aport,by=c(“origin”=“faa”)) %>% inner_join(aport, by=c(“dest”=“faa”), suffix=c(“_origin”,“_dest”))

LGA2001<-get_flights(“LGA”, 2001)%>% inner_join(aline,by=“carrier”) %>% inner_join(aport,by=c(“origin”=“faa”)) %>% inner_join(aport, by=c(“dest”=“faa”), suffix=c(“_origin”,“_dest”))

EWR2020<-get_flights(“EWR”, 2020)%>% inner_join(aline,by=“carrier”) %>% inner_join(aport,by=c(“origin”=“faa”)) %>% inner_join(aport, by=c(“dest”=“faa”), suffix=c(“_origin”,“_dest”))

EWR2013<-get_flights(“EWR”, 2013)%>% inner_join(aline,by=“carrier”) %>% inner_join(aport,by=c(“origin”=“faa”)) %>% inner_join(aport, by=c(“dest”=“faa”), suffix=c(“_origin”,“_dest”))

EWR2001<-get_flights(“EWR”, 2001)%>% inner_join(aline,by=“carrier”) %>% inner_join(aport,by=c(“origin”=“faa”)) %>% inner_join(aport, by=c(“dest”=“faa”), suffix=c(“_origin”,“_dest”))

df <- rbind(LGA2001,JFK2001,EWR2001,LGA2013,JFK2013,EWR2013,LGA2020,JFK2020,EWR2020)

write.csv(df, file=“finalflights.csv”)

df <- read.csv("finalflights.csv")
head(df)
##   X year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## 1 1 2001     1   1       18           1947       271      308           2331
## 2 2 2001     1   1      542            534         8      825            836
## 3 3 2001     1   1      557            600        -3      655            725
## 4 4 2001     1   1      605            600         5      830            819
## 5 5 2001     1   1      606            600         6      740            755
## 6 6 2001     1   1      607            600         7      715            726
##   arr_delay carrier flight   tailnum origin dest air_time distance hour minute
## 1       217      US    309 N287A\xe4    LGA  PBI      151     1035   19     47
## 2       -11      AA    115    N429A1    LGA  DFW      201     1389    5     34
## 3       -30      UA    113 N359\xe41    LGA  ORD      105      733    6      0
## 4        11      DL    281    N542D1    LGA  ATL      105      761    6      0
## 5       -15      NW    143 N354N\xe6    LGA  MEM      137      963    6      0
## 6       -11      AA     17    N518A1    LGA  ORD      103      733    6      0
##             time_hour                  name.x             name.y lat_origin
## 1 2001-01-01 19:00:00         US Airways Inc. La Guardia Airport    40.7772
## 2 2001-01-01 05:00:00  American Airlines Inc. La Guardia Airport    40.7772
## 3 2001-01-01 06:00:00   United Air Lines Inc. La Guardia Airport    40.7772
## 4 2001-01-01 06:00:00    Delta Air Lines Inc. La Guardia Airport    40.7772
## 5 2001-01-01 06:00:00 Northwest Airlines Inc. La Guardia Airport    40.7772
## 6 2001-01-01 06:00:00  American Airlines Inc. La Guardia Airport    40.7772
##   lon_origin alt_origin tz_origin dst_origin     tzone_origin
## 1   -73.8726         21        -5          A America/New_York
## 2   -73.8726         21        -5          A America/New_York
## 3   -73.8726         21        -5          A America/New_York
## 4   -73.8726         21        -5          A America/New_York
## 5   -73.8726         21        -5          A America/New_York
## 6   -73.8726         21        -5          A America/New_York
##                                               name lat_dest lon_dest alt_dest
## 1                 Palm Beach International Airport  26.6832 -80.0956       19
## 2          Dallas Fort Worth International Airport  32.8968 -97.0380      607
## 3             Chicago O'Hare International Airport  41.9786 -87.9048      672
## 4 Hartsfield Jackson Atlanta International Airport  33.6367 -84.4281     1026
## 5                    Memphis International Airport  35.0424 -89.9767      341
## 6             Chicago O'Hare International Airport  41.9786 -87.9048      672
##   tz_dest dst_dest       tzone_dest
## 1      -5        A America/New_York
## 2      -6        A  America/Chicago
## 3      -6        A  America/Chicago
## 4      -5        A America/New_York
## 5      -6        A  America/Chicago
## 6      -6        A  America/Chicago

Map

First, we wanted to see the locations of destinations most traveled to originating from the NYC metropolitan area. To do this, we find volumes listed at each destination, and then overlay them with sizes corresponding over a US Map. 

# Load the airport locations from the `airports` dataset
airport_locs <- aport %>%
  select(lon, lat, faa, tz) %>%
  rename(longitude = lon, latitude = lat)

# Join the flights dataset with the airport locations dataset
flights_with_locs <- df %>%
  left_join(airport_locs, by = c("dest" = "faa"))

# Group the data by the destination airport and count the number of flights to each airport
flight_volume <- flights_with_locs %>%
  group_by(dest, longitude, latitude) %>%
  summarize(volume = n()) %>%
  ungroup()
## `summarise()` has grouped output by 'dest', 'longitude'. You can override using
## the `.groups` argument.
# Get a map of the United States  area
bbox <- c(bottom = 25.75, top = 49 , right = -67, left = -125)
usmap <- get_stamenmap(bbox = bbox, zoom = 4, maptype = 'toner-lite') 
## ℹ Map tiles by Stamen Design, under CC BY 3.0. Data by OpenStreetMap, under ODbL.
# Create a scatter plot of the data, with the size of the points representing flight volume
map <- ggmap(usmap) +
  geom_point(data = flight_volume, aes(x = longitude, y = latitude, size = volume), alpha = 0.5) +
  scale_size(range = c(2, 10)) +
  labs(title = "Flight Volume to US Destination Airports",
       subtitle = "Size of points represents flight volume",
       size = "Flight Volume") 

map
## Warning: Removed 3 rows containing missing values (geom_point).

Average Departure Delays Per Month for Each Year

Look for any extreme observations

In order to investigate average departure delay per month, we decided to see the distribution of departure delays per year. There was a large outlier effect considering some flight had delays that were hours long. For the rest of our analysis, we decided to only consider flights that had no more than a 3 hour delay.

plot_ly(df, x = ~dep_delay, type = "box")
## Warning: Ignoring 47282 observations
#Per Year
p2 <- plot_ly(df, x = ~year, y = ~dep_delay) %>%
  add_boxplot(name = "year vs Departure Delay")

p2
## Warning: Ignoring 47282 observations

Analysis

To investigate average delays per month, we first had to seperete the whole dataset by specific years. This is so each mean is calculated according to these years and not based off the whole dataset. As stated previously we also took out any flights that had a departure delay greater than 3 hours. You can see that average departure delays actually decreased during COVID. This means that travel may have been more efficient during COVID. 9/11 stayed around our control group of 2013. Although 2001 did have a lower average departure delay than 2013, it was nowhere near COVID which actually experienced negative average departure delay. You can also see that September 2001 had the highest average departure delay compared to September 2013 and 2020. In all, we can say that 2020 had the lowest average departure delay for every month.

delays2020<- df %>%
  drop_na()%>%
  filter(year==2020) %>%
  filter(dep_delay<180)%>%
  group_by(month)%>%
  summarise(avg_dep_delay=mean(dep_delay, na.rm=TRUE))

delays2013<- df %>%
  drop_na()%>%
  filter(year==2013) %>%
  filter(dep_delay<180)%>%
  group_by(month)%>%
  summarise(avg_dep_delay=mean(dep_delay, na.rm=TRUE))

delays2001<- df %>%
  drop_na()%>%
  filter(year==2001) %>%
  filter(dep_delay<180)%>%
  group_by(month)%>%
  summarise(avg_dep_delay=mean(dep_delay, na.rm=TRUE))


delays<- data.frame(delays2001,delays2013,delays2020)

delaysfig <- plot_ly(delays, x = ~month, y = ~avg_dep_delay, name = '2001', type = 'scatter', mode = 'lines+markers') 

delaysfig <- delaysfig %>% add_trace(x= ~month.1, y = ~avg_dep_delay.1, name = '2013', mode = 'lines+markers') 

delaysfig <- delaysfig %>% add_trace(x= ~month.2, y = ~avg_dep_delay.2, name = '2020', mode = 'lines+markers')

delaysfig <- delaysfig %>% layout(title = "Average Departure Delay",
         xaxis = list(title = "Month"),
         yaxis = list (title = "Minutes"))

delaysfig

Percentage of Cancelled Flights Per Month for Each Year

To find the number of cancellations that normally occur, we examine 2013 data, and then compare these to the years that were affected by disaster shocks. You can see that COVID had cancellations far greater than 9/11.

cancellations2020<- df %>%
  filter(year==2020) %>%
  group_by(month)%>%
  summarise(cancelled=100*mean(is.na(dep_delay)))

cancellations2013<- df %>%
  filter(year==2013) %>%
  group_by(month)%>%
  summarise(cancelled=100*mean(is.na(dep_delay)))

cancellations2001<- df %>%
  filter(year==2001) %>%
  group_by(month)%>%
  summarise(cancelled=100*mean(is.na(dep_delay)))

cancellations<- data.frame(cancellations2001,cancellations2013,cancellations2020)

cancellationsfig <- plot_ly(cancellations, x = ~month, y = ~cancelled, name = '2001', type = 'scatter', mode = 'lines+markers')

cancellationsfig <- cancellationsfig %>% add_trace(x= ~month.1, y = ~cancelled.1, name = '2013', mode = 'lines+markers')

cancellationsfig <- cancellationsfig %>% add_trace(x= ~month.2, y = ~cancelled.2, name = '2020', mode = 'lines+markers')

cancellationsfig <- cancellationsfig %>% layout(title = "Percentage of Cancelled Flights",
         xaxis = list(title = "Month"),
         yaxis = list (title = "Percentage"))

cancellationsfig

Percentage of Cancelled Flights Per Month for Each Year By Airport

We wanted to investigate whether amount of cancellations were varying across airports in NYC. We did this for years 2020 for COVID, 2013 as a control, and 2001 for 9/11. We calculated these cancellations as a proportion for that year so we can disregard the fact that airports may have different flight volumes. Each year, LGA seems to have the most cancellations. You can also see the shocks of cancellations that happened for each event. Using 2013 as our control, we can see that flight cancelations don’t exceed more than 5.5 percent for any airport for that year. This percentage is greatly higher in the years where the shocks happened. However, you can see recovery shortly after the shocks, with percentage of cancelled flights decreasing dramatically.

cancellations2020a<- df %>%
  filter(year==2020) %>%
  group_by(month, origin)%>%
  summarise(cancelled=100*mean(is.na(dep_delay)))
## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.
cancellations2013a <- df %>%
  filter(year==2013) %>%
  group_by(month, origin)%>%
  summarise(cancelled=100*mean(is.na(dep_delay)))
## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.
cancellations2001a <- df %>%
  filter(year==2001) %>%
  group_by(month, origin)%>%
  summarise(cancelled=100*mean(is.na(dep_delay)))
## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.
cancellationsa <- data.frame(cancellations2001a,cancellations2013a,cancellations2020a)

ggplot(cancellationsa, aes(x = month, y= cancelled, color = origin)) +
  geom_line() + labs(title = "Percentage of Flights Cancelled Every Month",
       x = "Month",
       y = "Percentage of Flights")

ggplot(cancellationsa, aes(x = month.1, y= cancelled.1, color = origin.1)) +
  geom_line()+ labs(title = "Percentage of Flights Cancelled Every Month",
       x = "Month",
       y = "Percentage of Flights")

ggplot(cancellationsa, aes(x = month.2, y= cancelled.2, color = origin.2)) +
  geom_line()+ labs(title = "Percentage of Flights Cancelled Every Month",
       x = "Month",
       y = "Percentage of Flights")

Recovery JFK EWR for 9/11 2001-2002

To find out how the NYC airline industry eventually recovered, we took time series data from succeeding years after the shock to find how flight volumes eventually rebounded to pre-shock levels. We chose to only find recovery data for JFK and EWR because the planes involved in 9/11 departed from these airports.

recovery <- read.csv("Downloads/recovery.csv")

recovery2002<- recovery%>%
  drop_na()%>%
  filter(year == 2002) %>%
  group_by(month) %>%
  summarise(volume = n()) 

recovery2001<- df %>%
  filter(origin %in% c("EWR", "JFK")) %>%
  drop_na()%>%
  filter(year == 2001) %>%
  group_by(month) %>%
  summarise(volume = n())

recoveryline<- data.frame(recovery2001,recovery2002)

recoveryfig <- plot_ly(recoveryline, x = ~month, y = ~volume, name = '2001', type = 'scatter', mode = 'lines+markers')

recoveryfig <- recoveryfig %>% add_trace(x= ~month.1, y = ~volume.1, name = '2002', mode = 'lines+markers')

recoveryfig <- recoveryfig %>% layout(title = "Volume of Flights Per Month After Shock",
         xaxis = list(title = "Month"),
         yaxis = list (title = "Volume"))

recoveryfig

Recovery for COVID

To find out how the NYC airline industry eventually recovered, we took time series data from succeeding years after the shock to find how flight volumes eventually rebounded to pre-shock levels.

recovery2022<- recovery %>%
  drop_na()%>%
  filter(year == 2022) %>%
  group_by(month) %>%
  summarise(volume = n()) 

recovery2020<- df %>%
  filter(origin %in% c("EWR", "JFK")) %>%
  drop_na()%>%
  filter(year == 2020) %>%
  group_by(month) %>%
  summarise(volume = n())

recoveryline2 <- data.frame(recovery2020,recovery2022)

recoveryfig2 <- plot_ly(recoveryline2, x = ~month, y = ~volume, name = '2020', type = 'scatter', mode = 'lines+markers')

recoveryfig2 <- recoveryfig2 %>% add_trace(x= ~month.1, y = ~volume.1, name = '2022', mode = 'lines+markers')

recoveryfig2 <- recoveryfig2 %>% layout(title = "Volume of Flights Per Month After Shock",
         xaxis = list(title = "Month"),
         yaxis = list (title = "Volume"))

recoveryfig2